Takeda data with the additional NK pool in the sample.
However,
the added NK cells don’t have any tags.
I tried to classify the
additional NK cells and endogenous NK cells using the ML algorithms as
well as to find the features that separate them.
Note about data scaling/transformation methods
| Term | Description | R Code Example |
|---|---|---|
| center | Centers the data to the mean, shifting the data such that the center of each column is located at the origin by subtracting the mean value of each column. | data <- scale(data, center = TRUE, scale = FALSE) |
| scale | Standardizes the data by dividing each column by its standard deviation, adjusting the scale of the data. | data <- scale(data, center = FALSE, scale = TRUE) |
| YeoJohnson | Applies the Yeo-Johnson transformation, which is designed to maintain a normal distribution of the data. | library(car); data <- YeoJohnson(data) |
| nzv (Near Zero Variance) | Removes columns (features) with almost no variance, identifying and excluding features where the variance is close to zero. | library(caret); nzv <- nearZeroVar(data); data <- data[, -nzv] |
VariableFeatures(obj.srt) %>% head()
## [1] "CCL4L2" "HIST1H4C" "HBB" "CXCL8" "SPP1" "HBA2"
cat("The number of variable features is ", length(VariableFeatures(obj.srt)), "\n")
## The number of variable features is 3000
# Replace "-" with "_"
VariableFeatures(obj.srt) <- gsub("-", "_", VariableFeatures(obj.srt))
selected_treatments = c("CTL","NKC")
cells = obj.srt@meta.data[obj.srt@meta.data$sample %in% selected_treatments,] %>% rownames()
df= obj.srt@meta.data %>% filter(sample %in% selected_treatments) %>% select(c("orig.ident","sample"))
df$sample =factor(df$sample, levels = c("CTL","NKC"))
df %>% select(sample) %>% table()
## sample
## CTL NKC
## 325 922
# df
Features : Variable features (3000)
Gene expression : normalized data not scaled (scale will be done
later)
Cells : Cells from CTL and NKC samples
exp.mtx= data.frame(obj.srt@assays$RNA@data, check.names = F)[VariableFeatures(obj.srt),cells] %>% t() %>% data.frame(check.names = F)
exp.mtx %>% dim()
## [1] 1247 3000
set.seed(1234)
train_prop = 0.75
cell_split= initial_split(df, prop = train_prop)
cell_training=training(cell_split)
cell_test=testing(cell_split)
cell_split
cell_training
cell_test
cell_training$sample %>% table()
cell_test$sample %>% table()
cell_training
exp.mtx
train_set <- merge(cell_training, exp.mtx, by = "row.names")
train_set[1:3,1:8]
## Row.names orig.ident sample CCL4L2 HIST1H4C HBB CXCL8
## 1 TN30293_AAACGCTGTTCAAAGA-2 30293 NKC 0.000000 0.000000 0 2.366233
## 2 TN30293_AAAGAACCAATCGCAT-2 30293 NKC 3.845169 0.000000 0 0.000000
## 3 TN30293_AAAGGTAAGATGAATC-2 30293 NKC 3.556894 4.585522 0 0.000000
## SPP1
## 1 0
## 2 0
## 3 0
test_set <- merge(cell_test, exp.mtx, by = "row.names")
test_set[1:3,1:8]
## Row.names orig.ident sample CCL4L2 HIST1H4C HBB CXCL8
## 1 TN30293_AAACCCAAGATCCTAC-2 30293 NKC 0.000000 0.000000 0 0.00000
## 2 TN30293_AAAGAACAGCACTCGC-2 30293 NKC 0.000000 2.082862 0 0.00000
## 3 TN30293_AACAAAGCAGGAGGTT-2 30293 NKC 3.627605 0.000000 0 2.21248
## SPP1
## 1 0.000000
## 2 0.000000
## 3 1.623178
# Merge train and test set
tmp = rbind(train_set,test_set)
tmp %>% dim()
## [1] 1247 3003
# Use numeric values only
all_data = tmp[,4:ncol(tmp)]
# predict
all_data = predict(preProcess(all_data, method = c("center", "scale", "YeoJohnson", "nzv")),all_data)
all_data[1:3,1:8]
## CCL4L2 HIST1H4C CXCL8 CTSL CCL4 CENPF SOD2
## 1 -0.5719783 -0.7074698 2.0459790 -0.316838 -1.0812885 -0.4392773 1.4017583
## 2 1.8158364 -0.7074698 -0.5411131 -0.316838 1.5401948 -0.4392773 -0.7865508
## 3 1.8112970 1.6333059 -0.5411131 2.100573 0.9865894 2.2856705 -0.7865508
## TNFRSF9
## 1 -0.3341243
## 2 -0.3341243
## 3 -0.3341243
rownames(all_data) = tmp$Row.names
all_data$sample = tmp$sample
all_data %>% dim()
## [1] 1247 698
all_data$sample =factor(all_data$sample, levels = c("CTL","NKC"))
#all_data$sample
train_set_processed = all_data[train_set$Row.names,]
test_set_processed = all_data[test_set$Row.names,]
set.seed(94512)
tsne <- Rtsne::Rtsne(all_data[,colnames(all_data)!='sample'], dims = 2, perplexity=15, verbose=TRUE, max_iter = 500)
# tsne %>% saveRDS(paste0(dir,"rds/ML.23.10.13.tsne.rds"))
# tsne$Y has tSNE X and Y information
tsne.df = as.data.frame(tsne$Y)
rownames(tsne.df) = rownames(all_data)
colnames(tsne.df) = c('tsne.1', 'tsne.2')
tsne.df$sample = all_data$sample
ggplot(tsne.df, aes(x=tsne.1, y=tsne.2, color=sample))+
geom_point()+
theme_bw()
data_for_clustering <- all_data[, colnames(all_data) != 'sample']
k <- 6
kmeans_result <- kmeans(data_for_clustering, centers = k, nstart = 5)
# check the result
#kmeans_result$cluster
# check the cluster center
#kmeans_result$centers
kmeans_result %>% saveRDS(paste0(dir,"rds/ML.23.10.13.kmeans_6.rds"))
clustered_tsne_data <- data.frame(Cluster = as.factor(kmeans_result$cluster), tsne.df)
# tSNE Plot
ggplot(clustered_tsne_data, aes(tsne.1,tsne.2, color = Cluster)) +
geom_point() +
labs(title = "t-SNE Clustering") +
theme_minimal()
ggplot(clustered_tsne_data, aes(tsne.1,tsne.2, color = Cluster)) +
geom_point() +
labs(title = "t-SNE Clustering") +
theme_minimal() +facet_wrap(.~Cluster, ncol = 3)
Random Forest: Train model
CTL-NKC classification
df2=train_set_processed
colnames(df2) = gsub("-", "_", colnames(df2))
sel=colnames(df2[,1:(ncol(df2)-1)])
# sel = sel[!(grepl("HLA", sel))]
# sel = sel[!(grepl("MT", sel))]
# sel = sel[!(grepl("EPB41L4A", sel))]
# Assuming 'sample' is the response variable and 'sel' is a character vector of predictor variables
response_var <- "sample"
predictor_vars <- sel
# Create the formula using reformulate
test_vars <- predictor_vars[1:620] # adjust based on your actual data
formula <- as.formula(paste(response_var, "~",
paste(test_vars, collapse = "+")))
# formula <- as.formula(paste(response_var, "~",
# paste(predictor_vars, collapse = "+")))
to_forest = df2
# train_control
train_control <- trainControl(method="cv", number=10, classProbs = TRUE, summaryFunction = twoClassSummary)
# Check the data frame structure and variable names
set.seed(123)
Sys.time()
rf2 <- train(formula, data = to_forest, method = "rf", tuneLength=10, importance=T, trControl=train_control, metric = "ROC")
# Sys.time()
#rf2 %>% saveRDS(paste0(dir, "rds/ML.23.10.13.rf2.rds"))
# Define the trainControl settings with the positive class as "NKC"
# train_control
train_control <- trainControl(method="cv", number=10, classProbs = TRUE,
summaryFunction = twoClassSummary)
# Train the random forest model with the specified trainControl
rf3 <- train(formula, data = to_forest, method = "rf", tuneLength=10, importance=T, trControl=train_control, metric = "ROC", positive = "NKC")
rf3 %>% saveRDS(paste0(dir, "rds/ML.23.10.13.rf3.NKC.positive.rds"))
rf2 = readRDS(paste0(dir, "rds/ML.23.10.13.rf2.rds"))
rf2
## Random Forest
##
## 935 samples
## 620 predictors
## 2 classes: 'CTL', 'NKC'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 841, 842, 842, 842, 841, 841, ...
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.8112668 0.008333333 0.9985714
## 3 0.8128744 0.012500000 0.9971429
## 7 0.8171752 0.042572464 0.9942857
## 13 0.8268715 0.046557971 0.9914286
## 25 0.8232656 0.054528986 0.9871222
## 48 0.8260115 0.092934783 0.9814079
## 91 0.8278610 0.101449275 0.9714079
## 173 0.8231108 0.114130435 0.9699793
## 327 0.8149763 0.140217391 0.9642650
## 620 0.8137224 0.148188406 0.9585507
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 91.
rf3 = readRDS(paste0(dir, "rds/ML.23.10.13.rf3.NKC.positive.rds"))
rf3
## Random Forest
##
## 935 samples
## 620 predictors
## 2 classes: 'CTL', 'NKC'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 841, 841, 841, 842, 842, 841, ...
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 2 0.8186980 0.004166667 0.9985714
## 3 0.8220146 0.029710145 0.9928571
## 7 0.8253601 0.021195652 0.9928364
## 13 0.8312241 0.034239130 0.9899793
## 25 0.8306692 0.054891304 0.9856936
## 48 0.8348781 0.076449275 0.9799793
## 91 0.8359281 0.114130435 0.9785507
## 173 0.8329670 0.139130435 0.9714079
## 327 0.8312442 0.152355072 0.9756936
## 620 0.8261274 0.190579710 0.9728364
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 91.
df3=test_set_processed
colnames(df3) = gsub("-", "_", colnames(df3))
# Test the accuracy of the final model on the test set
prediction = predict(rf2,newdata = df3)
postResample(prediction,df3$sample)
## Accuracy Kappa
## 0.7403846 0.1474835
confusionMatrix(prediction,df3$sample)
## Confusion Matrix and Statistics
##
## Reference
## Prediction CTL NKC
## CTL 11 3
## NKC 78 220
##
## Accuracy : 0.7404
## 95% CI : (0.688, 0.7881)
## No Information Rate : 0.7147
## P-Value [Acc > NIR] : 0.1738
##
## Kappa : 0.1475
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.12360
## Specificity : 0.98655
## Pos Pred Value : 0.78571
## Neg Pred Value : 0.73826
## Prevalence : 0.28526
## Detection Rate : 0.03526
## Detection Prevalence : 0.04487
## Balanced Accuracy : 0.55507
##
## 'Positive' Class : CTL
##
##Random Forest most important features
roc_imp_rf <- varImp(rf2, scale = FALSE)
plot(roc_imp_rf, top=30)
exp.mtx.tsne = cbind(tsne.df[rownames(exp.mtx),], exp.mtx)
exp.mtx.tsne %>% ggplot(aes(tsne.1, tsne.2, color=GZMA)) + geom_point()
var_importance <- varImp(rf3, scale = FALSE)
plot(var_importance, top=30)
Use all_data from above
# Use numeric values only
all_data = tmp[,4:ncol(tmp)]
# predict
all_data = predict(preProcess(all_data, method = c("center", "scale", "YeoJohnson", "nzv")),all_data)
all_data[1:3,1:8]
rownames(all_data) = tmp$Row.names
all_data$sample = tmp$sample
all_data %>% dim()
all_data$sample =factor(all_data$sample, levels = c("CTL","NKC"))
# I can't use DESeq2 because I only have a scaled data
# Load DESeq2 library
library(DESeq2)
# Create a DESeqDataSet
dds <- DESeqDataSetFromMatrix(countData, colData, design = ~ cluster)
# Run differential expression analysis
dds <- DESeq(dds)
# Extract results
res <- results(dds)
# Filter for significant genes
significant_genes <- res[which(res$padj < 0.05), ]
# Get top markers for each cluster
top_markers_per_cluster <- by(significant_genes, significant_genes$cluster, function(cluster_data) {
top_n(cluster_data, n = 10, wt = abs(log2FoldChange))
})
scaled_data= all_data[names(kmeans_result$cluster),]
scaled_data$cluster = kmeans_result$cluster
scaled_data$cluster = factor(scaled_data$cluster)
levels(scaled_data$cluster) = c(rep("endogenous_NK",3), rep("additional_NK",3))
# Filter the data for the two clusters
cols=colnames(scaled_data)[!(colnames(scaled_data) %in% c('cluster','sample'))]
endogenous_NK_data <- scaled_data[scaled_data$cluster == "endogenous_NK", ][,colnames(scaled_data)[!(colnames(scaled_data) %in% c('cluster','sample'))]]
additional_NK_data <- scaled_data[scaled_data$cluster == "additional_NK", ][,colnames(scaled_data)[!(colnames(scaled_data) %in% c('cluster','sample'))]]
differentially_expressed_genes <- data.frame(Gene = character(0), P_Value = numeric(0), MeanDifference = numeric(0))
for (gene in cols) {
endo_mean <- mean(endogenous_NK_data[, gene])
add_mean <- mean(additional_NK_data[, gene])
p_value <- wilcox.test(endogenous_NK_data[, gene], additional_NK_data[, gene])$p.value
if (p_value < 0.05) { # You can adjust the significance threshold (e.g., 0.05)
mean_difference <- endo_mean - add_mean
differentially_expressed_genes <- rbind(differentially_expressed_genes,
data.frame(Gene = gene, P_Value = p_value, MeanDifference = mean_difference))
}
}
differentially_expressed_genes %>% arrange(MeanDifference) %>% head(10)
## Gene P_Value MeanDifference
## 1 ALOX5AP 1.329318e-89 -1.1136153
## 2 CD52 5.402229e-81 -1.1060831
## 3 CTSW 1.246408e-78 -1.0773024
## 4 CCL3 8.141245e-66 -0.9704443
## 5 CCL4L2 6.043726e-61 -0.9429490
## 6 JAML 1.139564e-62 -0.9269700
## 7 GZMA 6.287761e-56 -0.9094653
## 8 SH3BGRL3 2.415529e-57 -0.9023268
## 9 CEBPD 1.102471e-48 -0.8792903
## 10 FCER1G 7.346751e-49 -0.8531323
differentially_expressed_genes %>% filter(MeanDifference > 0.1) %>% arrange(P_Value)
## Gene P_Value MeanDifference
## 1 RPS18 2.003957e-74 0.9735399
## 2 RPS2 4.186645e-47 0.7800980
## 3 RPS4Y1 5.496090e-24 0.6161586
## 4 SPOCK2 4.766461e-19 0.5104610
## 5 ITGA4 9.778338e-18 0.5249422
## 6 EEF1B2 3.051736e-17 0.3639063
## 7 MALAT1 5.723633e-17 0.4500622
## 8 CXCR4 5.091805e-16 0.3945730
## 9 RPLP0 1.064833e-14 0.3834748
## 10 TMSB10 2.528810e-14 0.3793897
## 11 MIF 6.464242e-13 0.3426885
## 12 RPSA 8.527060e-13 0.3487072
## 13 DUSP2 2.785634e-11 0.4217843
## 14 GIMAP7 8.010904e-11 0.3171209
## 15 PDE4B 2.388929e-10 0.4040134
## 16 GZMH 2.898639e-08 0.2473633
## 17 NME2 2.259812e-07 0.2001636
## 18 ZNF331 4.200823e-07 0.3235077
## 19 HEBP2 6.443505e-07 0.3208917
## 20 GAS5 2.461599e-06 0.1819466
## 21 GZMM 4.559896e-06 0.3081114
## 22 PLAAT4 7.539248e-06 0.1719632
## 23 TCF7 1.361239e-05 0.2826387
## 24 ZFAS1 1.641383e-05 0.1289275
## 25 MT2A 1.092192e-04 0.1291319
## 26 NR4A2 1.407604e-04 0.2625901
## 27 HSP90AB1 1.854971e-04 0.1455816
## 28 CD7 5.884829e-04 0.1155778
## 29 TSPO 5.934343e-04 0.1083054
## 30 SNHG12 2.541000e-03 0.2285674
## 31 TUBB 2.646932e-03 0.1203508
## 32 ID3 3.987375e-03 0.2483297
## 33 TMIGD2 5.773546e-03 0.2035260
## 34 KDM6B 8.634898e-03 0.2097041
## 35 CD3G 8.877499e-03 0.1611099
## 36 PDCL3 1.349684e-02 0.1750981
## 37 CD83 1.689106e-02 0.2057964
## 38 IFITM3 2.161156e-02 0.2023153
## 39 MT1X 2.822768e-02 0.1874456
differentially_expressed_genes %>% filter(MeanDifference < - 0.5) %>% arrange(P_Value)
## Gene P_Value MeanDifference
## 1 ALOX5AP 1.329318e-89 -1.1136153
## 2 CD52 5.402229e-81 -1.1060831
## 3 CTSW 1.246408e-78 -1.0773024
## 4 CCL3 8.141245e-66 -0.9704443
## 5 JAML 1.139564e-62 -0.9269700
## 6 CCL4L2 6.043726e-61 -0.9429490
## 7 SH3BGRL3 2.415529e-57 -0.9023268
## 8 GZMA 6.287761e-56 -0.9094653
## 9 FCER1G 7.346751e-49 -0.8531323
## 10 CEBPD 1.102471e-48 -0.8792903
## 11 ACTB 3.451951e-46 -0.8066236
## 12 PAG1 3.930850e-46 -0.7927725
## 13 LIM2 9.898872e-45 -0.7665869
## 14 CCL5 4.836690e-44 -0.8218172
## 15 AC092821.3 1.351230e-43 -0.8283555
## 16 ARL4C 7.952528e-42 -0.8161295
## 17 SRGN 7.527401e-40 -0.7525193
## 18 IGFLR1 1.261123e-39 -0.8060718
## 19 CCL4 1.211623e-38 -0.7437139
## 20 S100A11 2.760035e-38 -0.7542336
## 21 TNFSF10 1.767410e-37 -0.7801545
## 22 SYNGR1 4.306390e-37 -0.6951116
## 23 GLRX 2.070608e-36 -0.6845145
## 24 XIST 3.421161e-36 -0.7560991
## 25 PLIN2 4.224885e-36 -0.7598861
## 26 HLA-DRA 4.644781e-36 -0.7579996
## 27 CEBPB 7.591118e-36 -0.7557146
## 28 CD82 7.650326e-36 -0.7455798
## 29 LAIR2 6.667963e-35 -0.6647056
## 30 IL2RA 9.219141e-35 -0.6868486
## 31 HOPX 2.188551e-34 -0.7622123
## 32 GZMB 6.319918e-34 -0.7231920
## 33 UCP2 1.517852e-33 -0.6644241
## 34 LGALS3 2.562817e-33 -0.7322306
## 35 MAF 3.983832e-33 -0.6602905
## 36 TPM4 7.982310e-32 -0.7059643
## 37 CTSB 3.532622e-31 -0.6330372
## 38 CLIC3 1.743288e-30 -0.6441608
## 39 CTSD 7.721000e-30 -0.7167413
## 40 CAPG 1.122687e-29 -0.7106018
## 41 HLA-DRB1 1.987496e-29 -0.7090800
## 42 RGS10 3.887982e-29 -0.6672097
## 43 MMP25 9.671099e-29 -0.5945201
## 44 HLA-DMA 2.030038e-28 -0.5937054
## 45 CPNE7 7.172844e-28 -0.5856930
## 46 CD74 1.038888e-27 -0.6471750
## 47 DRAXIN 1.496445e-27 -0.5697371
## 48 C12orf75 1.941916e-27 -0.6490073
## 49 TESC 1.006995e-26 -0.5978311
## 50 SQLE 4.863345e-26 -0.5717253
## 51 LCP1 6.693591e-26 -0.6429861
## 52 SMOX 7.679158e-26 -0.5614130
## 53 GEM 7.842933e-26 -0.5342845
## 54 CCL3L1 2.846188e-25 -0.5546991
## 55 TMEM14C 5.417853e-25 -0.5560812
## 56 HAVCR2 6.909213e-25 -0.5408489
## 57 ARHGAP18 9.171238e-25 -0.5660585
## 58 FCGR3A 4.150809e-24 -0.5120324
## 59 ENTPD1 1.916897e-23 -0.5138256
## 60 FTH1 4.967467e-23 -0.5201652
## 61 PPP1R15A 5.845569e-23 -0.6209233
## 62 IFI6 6.557328e-23 -0.5889474
## 63 CCND2 6.571843e-23 -0.6146277
## 64 BATF 6.463243e-22 -0.5116991
## 65 CD9 6.758444e-22 -0.5244289
## 66 AQP3 7.430447e-22 -0.5139439
## 67 VIM 9.786747e-22 -0.5469913
## 68 ANKRD28 1.080723e-21 -0.5041171
## 69 NCF4 1.466033e-21 -0.5264441
## 70 PGD 1.506905e-21 -0.5127455
## 71 RGS16 1.720735e-21 -0.5683245
## 72 TOX2 6.068589e-21 -0.5108969
## 73 MT-CO3 2.508376e-20 -0.5251673
## 74 LMNA 7.969750e-20 -0.5693821
## 75 SLC1A4 2.175407e-19 -0.5001452
## 76 ACTG1 2.672603e-19 -0.5414504
## 77 TIGIT 9.578690e-19 -0.5920736
## 78 BST2 1.439517e-18 -0.5561233
## 79 CORO1A 8.397181e-18 -0.5707347
## 80 HDLBP 1.907671e-17 -0.5403632
## 81 HMGB2 5.964086e-17 -0.5163635
## 82 MRPL52 3.024085e-16 -0.5104916
## 83 IER3 1.050899e-14 -0.5136831
## 84 KLRC2 2.076450e-14 -0.5011354
## 85 MYL12A 6.133341e-14 -0.5062116
differentially_expressed_genes %>% ggplot(aes(MeanDifference)) + geom_density()
differentially_expressed_genes$MeanDifference %>% summary()
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.1136 -0.3974 -0.2486 -0.2659 -0.1330 0.9735
endogenous_NK_rows <- scaled_data[scaled_data$cluster == "endogenous_NK", c('cluster','sample')] %>% rownames()
additional_NK_rows <- scaled_data[scaled_data$cluster == "additional_NK", c('cluster','sample')] %>% rownames()
obj.srt = readRDS(paste0(dir, "rds/Takeda.NK.30293.30298.NK_only.23.09.28.rds"))
obj.srt@meta.data$NK = ""
obj.srt@meta.data[endogenous_NK_rows,]$NK = "Endogenous_NK from CTL and NKC"
obj.srt@meta.data[additional_NK_rows,]$NK = "Additional_NK from NKC"
DimPlot(obj.srt, group.by = "NK", cols = c("grey","red","blue"))
DimPlot(obj.srt, group.by = "NK", cols = c("grey","red","blue"), split.by = "sample", ncol = 2)
genes = c("NKG7", "KLRC1", "KLRD1", "KLRB1")
FeaturePlot(obj.srt, genes, pt.size = 0.2)
VlnPlot(obj.srt, features = genes, group.by = "NK", stack = T, flip = F)
genes = rownames(obj.srt)[grepl("GZM", rownames(obj.srt))]
FeaturePlot(obj.srt, genes, pt.size = 0.2, ncol = 3)
VlnPlot(obj.srt, features = genes, group.by = "NK", stack = T, flip = F)
# Find markers between NKs
g2 = "Additional_NK from NKC"
g1 = "Endogenous_NK from CTL and NKC"
logfc=log2(1)
Idents(obj.srt) = 'NK'
logfc=log2(1)
mks =FindMarkers(obj.srt, ident.1 = g2, ident.2 = g1,
logfc.threshold = logfc)
pval=0.05
fc=1.2
mks = mks %>% mutate(DE=ifelse(avg_log2FC >= log2(fc) & p_val_adj < pval, 'UP',
ifelse(avg_log2FC <= -log2(fc) & p_val_adj < pval, 'DN','no_sig')))
mks$DE = factor(mks$DE, levels = c('UP','DN','no_sig'))
mks$gene = rownames(mks)
mks =mks %>% mutate(labels= ifelse(DE == 'UP', gene, ifelse(DE=='DN',gene,'other')))
mks =mks %>% arrange(desc(avg_log2FC))
mks$gene <- rownames(mks)
mks =mks %>% mutate(labels= ifelse(DE == 'UP', gene, ifelse(DE=='DN',gene,'')))
mks =mks %>% arrange(desc(avg_log2FC))
mks %>% ggplot(aes(avg_log2FC, -log10(p_val_adj), color = DE)) +
geom_point(size = 1, alpha = 0.5) +
scale_color_manual(values = c("red",'blue', 'grey')) +
theme_classic() +
geom_vline(xintercept = c(-log2(fc), log2(fc)), color = 'grey') +
geom_hline(yintercept = -log10(0.05), color = 'grey') +
guides(colour = guide_legend(override.aes = list(size = 5))) +
geom_text(aes(label = labels), size = 3, show.legend = FALSE, hjust = 0, nudge_x = 0.03) +
ggeasy::easy_center_title() ## to center title
mks[mks$DE=="UP",]$gene
## [1] "REG4" "NTRK2" "UTS2" "MB" "DMD"
## [6] "SAXO2" "CYP1B1" "OLIG3" "CREB3L3" "LAIR2"
## [11] "CSF2RB" "CCL4L2" "LINC00892" "LIM2" "AC040970.1"
## [16] "HHLA3" "OTUB2" "APBB2" "MCEMP1" "MSC-AS1"
## [21] "JAML" "MB21D2" "NINJ2" "NPTX1" "WNT11"
## [26] "CCL3L1" "TJP1" "AMZ1" "AL139330.1" "RYR1"
## [31] "TFPI" "CEACAM1" "MYO1E" "CLEC12A" "ABCG1"
## [36] "PRSS57" "MAL" "STAP1" "CD109" "FUT7"
## [41] "CTLA4" "CLECL1" "LZTS1" "GPR35" "CD244"
## [46] "MMP25" "DRAXIN" "SMOX" "SMCO4" "DPF3"
## [51] "CCL3" "RCBTB2" "TMEM273" "ASB2" "PAX8-AS1"
## [56] "SYNGR1" "SIGLEC7" "SLC1A4" "LY6G5C" "MSC"
## [61] "PGLYRP2" "HYLS1" "NETO2" "LPAR6" "LRRC28"
## [66] "BCAR3" "NQO1" "PAG1" "GPAT3" "MAF"
## [71] "TGFBR3" "CSGALNACT1" "IL1RN" "UCP2" "TESC"
## [76] "IL2RA" "IL9R" "PRKAR1B" "CXorf40B" "IKZF4"
## [81] "CAMK1" "CPNE7" "NPDC1" "TOX2" "EPB41L2"
## [86] "RUNX2" "AQP3" "MTSS1" "CTSB" "NCF4"
## [91] "PLK1" "FDXR" "TBXAS1" "CORO1C" "CD52"
## [96] "ARHGEF12" "LAIR1" "LDB2" "AC017104.1" "FUT8"
## [101] "COPZ2" "NAPRT" "PTPRN2" "CAVIN3" "PGD"
## [106] "LUCAT1" "TNF" "FBXO32" "LINC00299" "DUSP6"
## [111] "HLA-DMA" "RITA1" "PACSIN2" "VAMP1" "C3orf14"
## [116] "ITGA2" "MPZL3" "DYRK4" "AC092821.3" "FNIP2"
## [121] "P2RX4" "FCGR3A" "ARHGAP18" "GNA12" "IFITM10"
## [126] "GLRX" "TFEB" "BUB1" "GEM" "PDGFA"
## [131] "EIPR1" "ITGB7" "PDLIM7" "CTSH" "CCDC50"
## [136] "TST" "ITPRIPL2" "CEBPD" "AGTRAP" "FEZ1"
## [141] "ZNF683" "CTSW" "PPARA" "LINC00539" "EMP1"
## [146] "NCALD" "KNL1" "CD70" "DPYSL2" "CCL4"
## [151] "RGS9" "SYTL2" "SQLE" "DOCK5" "WIPI1"
## [156] "CD82" "ITGAM" "GSN" "CEBPB" "TPX2"
## [161] "KIR2DL4" "GBA" "TRGC1" "HLA-DRA" "ALOX5AP"
## [166] "FADS1" "XIST" "CD226" "PRAF2" "H2AFJ"
## [171] "HMGB3" "CDKN1A" "GNGT2" "ENTPD1" "AC022706.1"
## [176] "CD9" "GRAMD1A" "MKI67" "SESTD1" "IFI6"
## [181] "LTC4S" "SLCO4A1" "ATP6V1B2" "TNFRSF9" "GLIPR1"
## [186] "IGFLR1" "GZMA" "INPP5F" "CD68" "ITGAE"
## [191] "BATF" "AC007384.1" "STRN3" "TOP2A" "SYNJ2"
## [196] "AFDN" "CPD" "ETHE1" "LGALS3" "CCR1"
## [201] "ZMIZ1" "TMEM14C" "CD59" "RGS10" "SLC27A2"
## [206] "FBXO6" "TPM4" "STX11" "NECTIN3" "PIK3R6"
## [211] "C12orf75" "CAPN12" "ANKRD28" "GPALPP1" "IER5L"
## [216] "PTPN6" "ASPM" "MMP24OS" "YPEL1" "ORAI3"
## [221] "CENPF" "ADCY3" "GALM" "TRIM13" "GYG1"
## [226] "IL18RAP" "WDR1" "RCSD1" "FCER1G" "SLFN12"
## [231] "TNFRSF1A" "TPST2" "CENPW" "ZNF720" "RGS16"
## [236] "NCR3" "RCAN1" "PLIN2" "ERI1" "FHL3"
## [241] "PIM1" "APOBR" "FDFT1" "SNX10" "HIST3H2A"
## [246] "NUMB" "CLIC3" "TNFSF10" "HOPX" "SCPEP1"
## [251] "ANTXR2" "ARID3B" "GPR68" "NABP2" "PYM1"
## [256] "FUCA1" "SAMHD1" "ARHGEF3" "TANK" "TFPT"
## [261] "HLA-DQA1" "HOXB4" "NSMCE1" "ARL4C" "HNRNPLL"
## [266] "TNFSF12" "NDFIP2" "PPP1R15A" "AP1S2" "RHBDF2"
## [271] "GRN" "SLC1A5" "FAM3C" "USB1" "GDE1"
## [276] "KLHL42" "SNX9" "SEPTIN1" "PLEK" "TSPAN14"
## [281] "ATF5" "TMEM65" "CTNNA1" "KLRC1" "CASP1"
## [286] "HAVCR2" "ECE1" "UBALD2" "ACADVL" "CHST11"
## [291] "HTATSF1" "SQOR" "PRKRIP1" "CD74" "TGFBR1"
## [296] "SNAP29" "ITGB2" "LAT2" "MIB1" "HDGFL3"
## [301] "CAPG" "CSF1" "TAGAP" "SFXN1" "PXK"
## [306] "PERP" "ZBTB43" "TMEM141" "FNDC3B" "PHLDA1"
## [311] "EZH2" "KCNAB2" "DUSP5" "SH3BGRL3" "RAB27A"
## [316] "OAS2" "HSH2D" "LMNA" "EBP" "CTSC"
## [321] "SEC61A1" "TEX30" "HLA-DRB1" "KMT5A" "TRAF5"
## [326] "ARL8B" "ICAM3" "CTSA" "HSDL2" "TNFAIP8"
## [331] "IL21R" "OSTF1" "HMOX1" "PRNP" "ZDHHC3"
## [336] "ENO2" "BST2" "HLA-DMB" "GNA15" "CLCN3"
## [341] "DPP4" "BCL2L11" "TIAM1" "IQGAP2" "CTSD"
## [346] "RAB1B" "HIPK1" "MXRA7" "LAPTM5" "CCND2"
## [351] "HMGB2" "RB1" "DCXR" "SLAMF7" "ARRB2"
## [356] "CYTH4" "HIPK2" "MRPL52" "RFLNB" "ZAP70"
## [361] "CITED2" "NQO2" "CD58" "HDLBP" "FAM192A"
## [366] "NDUFB3" "TYROBP" "GOLIM4" "CTBS" "GCNT1"
## [371] "NDUFS4" "TRAPPC5" "HCST" "NCOR2" "DNASE2"
## [376] "ACTB" "DUSP10" "UTP11" "NFIL3" "MPV17"
## [381] "PLD3" "HLA-DQB1" "QKI" "AP1S1" "LMNB1"
## [386] "P2RX5" "STOML2" "GNPTAB" "RAC2" "LCP1"
## [391] "NCAM1" "PLEKHF1" "NDUFA7" "PSTPIP1" "NAA38"
## [396] "PRPF6" "THUMPD3" "TSEN54" "SFXN3" "ATP8B4"
## [401] "CARHSP1" "CD48" "PLEC" "EI24" "MR1"
## [406] "ABI3" "SH3BP1" "IGF2R" "OSBPL8" "VMA21"
## [411] "SEPTIN11" "VPS25" "CHMP4B" "NAPA" "NUDT5"
## [416] "FURIN" "TRAPPC3" "KDELR1" "AHNAK" "DDAH2"
## [421] "S100A11" "CCL5" "MPST" "CSGALNACT2" "SRGN"
## [426] "MCM5" "MTHFD2" "XIAP" "CLIP1" "ARPC5"
## [431] "MDM2" "COMMD3" "PPP3CA" "COX11" "CD151"
## [436] "DBNL" "DAAM1" "WARS" "ERCC1" "TNFRSF1B"
## [441] "STAT3" "MATK" "PRDX6" "IFI27L2" "AKIRIN2"
## [446] "IFNG" "IDH2" "AC243829.4" "DENND1B" "CALCOCO2"
## [451] "VIM" "MT-CO2" "RHOC" "LAMTOR2" "SPTY2D1"
## [456] "CD3D" "SHMT2" "MAP3K8" "KLRC2" "MOB3A"
## [461] "ZBTB16" "PYCARD" "CST7" "MT-CO1" "DEF6"
## [466] "TFDP2" "MRPL28" "PIGX" "RNF167" "TRIP12"
## [471] "TIPARP" "CHMP5" "JOSD2" "BAX" "PCID2"
## [476] "HPRT1" "SAP30" "SNRPB2" "IER3" "FERMT3"
## [481] "ABHD2" "STAT1" "SPEN" "TMC6" "ZNHIT1"
## [486] "CTSS" "CMIP" "CD96" "FBXW11" "SOX4"
## [491] "GMFG" "GZMB" "C15orf40" "TALDO1" "TBC1D10C"
## [496] "ANKH" "PPP1R18" "GALNT2" "TMX4" "CD99"
## [501] "NDUFB1" "ANKRD11" "MYO9B" "DOK2" "DOCK8"
## [506] "SERINC3" "HELZ" "FASLG" "ANP32E" "ZDHHC20"
## [511] "DYNLL1" "CD164" "AP3D1" "NKG7" "RYBP"
## [516] "TRAF3IP3" "SECISBP2L" "ZFP36L1" "CD44" "IVNS1ABP"
## [521] "COMT" "JARID2" "RAB1A" "CAPZB" "MRPS11"
## [526] "TXK" "SLC6A6" "TAF12" "GPR171" "FLOT1"
## [531] "FMNL3" "CLTC" "ZNRD1" "MAPRE1" "PDE3B"
## [536] "SAT2" "CIAO1" "MOB1A" "HMOX2" "LCK"
## [541] "MEF2A" "DNPH1" "LAGE3" "UGP2" "RPS27L"
## [546] "ABHD17A" "OIP5-AS1" "CORO1A" "HM13" "SPN"
## [551] "STMN1" "TKT" "ATP5IF1" "HMGN2" "LGALS1"
## [556] "TM9SF2" "GUSB" "MRPS15" "GMCL1" "APOL6"
## [561] "RBM5" "ST8SIA4" "CBX5" "EMB" "MAPK1"
## [566] "SPATS2L" "EIF4G3" "CHCHD5" "RHOH" "MAD2L2"
## [571] "MRPS6" "DNM2" "LDLRAD4" "PRKCQ-AS1" "TIGIT"
## [576] "SAMSN1" "IP6K2" "ACTG1" "FNIP1" "DAD1"
## [581] "TLN1" "MZT2B" "INPP4A" "ITGAL" "RAB7A"
## [586] "NMRK1" "SRGAP3" "PICALM" "APOBEC3C" "ABRACL"
## [591] "DYNLRB1" "TMCO1" "PSMD14" "C6orf89" "MRPL34"
## [596] "NKAP" "RDX" "LPXN" "NCOA4" "TEX264"
## [601] "RIN3" "ATP5F1A" "ZFP91" "SMARCA4" "POLD4"
## [606] "TXN2" "COPRS" "RAB8B" "TOR1A" "ETFB"
## [611] "HIF1A" "PRKDC" "INTS6" "MSI2" "SQSTM1"
## [616] "PLSCR1" "TXNRD1" "CDC42SE1" "EIF2AK2" "COMMD4"
## [621] "LSP1" "UFD1" "SLC3A2" "ARL3" "CLTB"
## [626] "UBTF" "PSMB2" "CHD9" "NEMF" "EVL"
## [631] "MRPS7" "MIS18BP1" "GLO1" "RAP1A" "EFHD2"
## [636] "LYN" "FLNA" "DPM2" "MORF4L2" "SRI"
## [641] "CYTOR" "POLR3GL" "AOAH" "BHLHE40" "ANXA5"
## [646] "SLC9A3R1" "FAM107B" "RNF187" "SELENOH" "CTNNB1"
## [651] "HSD17B10" "MYL6" "SEPTIN2" "STIP1" "SLC16A3"
## [656] "PSMD4" "REEP5" "ZBTB7A" "DIAPH1" "GLUD1"
## [661] "TYMP" "ACTR3" "PGAM1" "USP9X" "ITM2A"
## [666] "KXD1" "RASAL3" "CXXC5" "UBE2V1" "RAB11FIP1"
## [671] "SLC5A3" "CD2BP2" "C7orf50" "ATP6V0D1" "PSAP"
## [676] "SASH3" "CARD16" "DYNC1H1" "LSM2" "SEC14L1"
## [681] "CSTB" "INPP5D" "MSN" "TMEM243" "SKAP1"
## [686] "SPAG9" "SLFN5" "APMAP" "FTH1" "OTULIN"
## [691] "ADSS" "GRK2" "GOPC" "MYL12A" "NLRC3"
## [696] "CRIP1" "NBL1" "SH3BGRL" "ATP5PB" "C17orf49"
## [701] "CCT8" "ANXA2" "TMEM258" "RBX1" "LINC01871"
## [706] "DDX6" "PARK7" "AHCYL1" "PRMT1" "TSG101"
## [711] "LAMP2" "PSENEN" "TMBIM6" "RTF2" "COPB2"
## [716] "ASAH1" "KLF13" "SERF2" "ITGB1" "H2AFV"
## [721] "ARPC1B" "BSG" "ITM2C" "CAP1" "CDC123"
## [726] "TIMP1" "GABARAPL2" "TIPRL" "NDFIP1" "IFNGR1"
## [731] "SIPA1L1" "EIF3A" "H3F3A" "CKLF" "SEPTIN9"
## [736] "CARD19" "UBE2F" "NDUFC2" "CLTA" "SEM1"
## [741] "GNG2" "XCL2" "OS9" "OCIAD2" "IDS"
## [746] "ATP5F1C" "MT-CYB" "ANXA11" "PPP1CA" "TRBC1"
## [751] "WAC" "PGLS" "LIMS1" "CLIC1" "GPX4"
## [756] "MBD2" "GNB2" "CHURC1" "PPP1R2" "SDF4"
## [761] "AP2M1" "NSA2" "MAP1LC3B" "CFL1" "CYTIP"
## [766] "ACTR2" "MT-ND4" "JPX" "DNAJC15" "RCN2"
## [771] "PCBP2" "DGUOK" "COMMD8" "MYO1F" "ARCN1"
## [776] "GRB2" "MT-CO3" "COX5A" "COX8A" "ARL6IP5"
## [781] "MRPL54" "NDUFS5" "DCTN3" "CSNK1A1" "C9orf16"
## [786] "CUTA" "PFN1" "MEAF6" "IL4R" "SDCBP"
## [791] "DNAJB6" "ATP5MF" "COX6B1" "TMOD3" "CD81"
## [796] "UBE2M" "MYH9" "CEP350" "MYADM" "FAM49B"
## [801] "ITM2B" "PTPN7" "MBNL1" "TSTD1" "PDAP1"
## [806] "PSMD8" "TNFRSF18" "CAPNS1" "KHDRBS1" "ZYX"
## [811] "PSMC4" "CD47" "ADAR" "CHMP2A" "MT-ATP6"
## [816] "FTL" "MACF1" "CD63" "COMMD6" "BRK1"
## [821] "BRD7" "EIF3K" "UBE2I" "TCEA1" "ACAP2"
## [826] "RALY" "COPE" "HSPA1A" "YWHAQ" "KAT6B"
## [831] "SMARCE1" "BTG2" "BOD1L1" "NDUFB6" "GNAI2"
## [836] "MCL1" "SNX3" "SPCS1" "ARID5A" "ARPC3"
## [841] "YME1L1" "NPM1" "CCSER2" "ATP6V0C" "OCIAD1"
## [846] "MRPL33" "AHI1" "SOCS1" "BLOC1S1" "ACAA2"
## [851] "CLK1" "CMPK1" "RBM25" "SSR3" "SRP14"
## [856] "LASP1" "VCP" "COMMD7" "IK" "ARPC2"
## [861] "RER1" "HMGN3" "ID2" "WDR83OS" "UQCRQ"
## [866] "PPP2R5C" "KPNB1" "NOP10" "SCAND1" "P4HB"
mks[mks$DE=="DN",]$gene
## [1] "RPL6" "RPS15A" "NACA" "RPS28" "RPL17" "RPL10A" "RPS7" "RPL27"
## [9] "RPL37" "MALAT1" "RPS15" "UBA52" "RPL11" "RPL15" "RPL18" "RPL24"
## [17] "EEF1A1" "HLA-B" "RPL35" "RPL36" "RPL3" "RPL14" "RPL29" "BTG1"
## [25] "HLA-A" "MIF" "RPLP2" "RPS3A" "TMSB10" "RPL22" "RPS13" "RPS9"
## [33] "EEF1G" "RPL38" "RPL35A" "RPSA" "RPS23" "RPL26" "RPS11" "RPLP1"
## [41] "FAU" "RPL8" "RPS3" "RPLP0" "RPS6" "RPL23A" "RPS21" "RPS20"
## [49] "RPL34" "RPL28" "RPL7A" "RPS5" "RPS25" "RPS27A" "RPS8" "NME2"
## [57] "RPL19" "RPS29" "RPL31" "RPL9" "RPL21" "RPL41" "RPL13" "RPL32"
## [65] "RPL30" "RPL37A" "RPS27" "RPL39" "RPS14" "RPS24" "RPL13A" "RPS2"
## [73] "EEF1B2" "RPL18A" "RPS18" "CXCR4" "EEF1D" "RPS12" "RPL10" "RPL36A"
## [81] "RPL27A" "PIK3R1" "GIMAP7" "GZMH" "HEBP2" "PDE4B" "DUSP2" "ITGA4"
## [89] "ZNF331" "SPOCK2" "KLF3" "LAYN" "IL7R" "IGFBP2" "RPS4Y1" "CXCR6"
## [97] "LAG3" "PRKY" "DDX3Y" "GZMK" "USP9Y" "EIF1AY" "CD8B"
mks %>% DT::datatable()